##################
## Solar Siting matrix inversions
##  for gpu-common (or non-gpu option)

# Uses QR decomposition
# Run with and without SOL_ controls
#---> SOL is the subregion-level utility-scale solar insolation * capacity_t

require(data.table)
require(iterators)
require(foreach)
require(parallel)
require(doSNOW)


WORK = file.path('Z:/user/ajk41/Solar siting/Scripts/JAERE Code Repository/Data JAERE')
WORK.OUT = file.path(WORK, 'intermediate_output')
dir.create(WORK.OUT)

#### Get data 
load(file.path(WORK.OUT, 'usingWorkspace.Rdata'))


#### For v5, reshape the df.split and df.common ####
df.useMYS = split(rbindlist(map(df.split.useMYS, function(x) {
  x$ORISPL = rownames(x)
  return(x)}), idcol = 'INTX'), by='ymn')

df.useMYS = lapply(df.useMYS, function(xx){
  list(df = df.split[xx$ORISPL],
              ym = xx$ym[1],
              ymn = xx$ymn[1])
})

#### jDecompose function ####
jDecompose <- function(x){
  if(class(x)[1]%in%c('data.frame','data.table')) x = x$ym
  x = as.character(x)
  y = strsplit(x = x, split="---")
  z = unlist(strsplit(x=y[[1]][2], split="--"))
  return(data.frame(intx = y[[1]][1],
                    ym = z, stringsAsFactors=F))
} # takes the long ym value, parses out the intx, year-mo for the ym



# Cool. Merge in National NSRDB solar insolation as SOL_XXXX --------------------------------------
WEST.names = gsub('PCA_', 'SOL_', unique(grep('PCA_', names(df.common.intx[['WEST']]), value=T)))
EAST.names = gsub('PCA_', 'SOL_', unique(grep('PCA_', names(df.common.intx[['EAST']]), value=T)))
TRE.names  = gsub('PCA_', 'SOL_', unique(grep('PCA_', names(df.common.intx[['TRE']] ), value=T)))

map.names = rbind(data.frame(INTX = 'W',NAME = WEST.names),
                  data.frame(INTX = 'E',NAME = EAST.names),
                  data.frame(INTX = 'T',NAME = TRE.names))
map.names$pcac = sapply(strsplit( as.character(map.names$NAME),'_[[:alnum:]]{1}'), '[', 2)


NN = readRDS(file.path(WORK, 'NSRDB_siting----2018-10-02.rds'))  # NSRDB requires manually querying and manually downloading NSRDB files and is not coded in R.
  ##--> some pcac (planning control area) mappings are combined below. This combines the solar for each subregion.
  NN[pcac=='MROE',pcac:='MROW']
  NN[pcac=='RFCM',pcac:='RFCW']
  NN[pcac=='NEWE',pcac:='NYUP']
  NN[pcac=='SPSO',pcac:='SPNO']
  NN = merge(NN, map.names, by='pcac', all=F)
  NNN = dcast(NN, formula = dh_e ~ NAME, fun.aggregate = sum,  value.var = 'solar_index', drop=T)


# df.common.intx[['WEST']][,grep('SOL_', names(df.common.intx[['WEST']]), value=T):=NULL]

df.common.intx[['WEST']] = merge(df.common.intx[['WEST']], NNN[,grep('dh_e|_W', names(NNN), value=F), with=F], by = 'dh_e')
df.common.intx[['EAST']] = merge(df.common.intx[['EAST']], NNN[,grep('dh_e|_E', names(NNN), value=F), with=F], by = 'dh_e')
df.common.intx[['TRE']]  = merge(df.common.intx[['TRE']] , NNN[,grep('dh_e|_T', names(NNN), value=F), with=F], by = 'dh_e')

## 
# New: combine down high-variance eGrid subregions (solar and load)
df.common.intx[['EAST']][,c('PCA_EMROW'):=list(PCA_EMROE+PCA_EMROW)]
df.common.intx[['EAST']][,c('PCA_ERFCW'):=list(PCA_ERFCM+PCA_ERFCW)]
df.common.intx[['EAST']][,c('PCA_ENYUP'):=list(PCA_ENEWE+PCA_ENYUP)]
df.common.intx[['EAST']][,c('PCA_ESPNO'):=list(PCA_ESPSO+PCA_ESPNO)]
df.common.intx[['EAST']][,c('PCA_EMROE','PCA_ERFCM',
                            'PCA_ENEWE','PCA_ESPSO'):=NULL]


cl <- snow::makeCluster(rep('localhost',22), type='SOCK')
registerDoSNOW(cl)


## Note: this was originally run on a GPU cluster. It has been re-coded to work in both GPU clusters or "traditional" clusters.
## There are now faster methods for doing QR decompositions over multiple cores. They are not employed here as they were not available at the time
#   the analysis for this paper was done.

RES = foreach(df.iter = iter(df.useMYS, by='cell'), .errorhandling = 'pass', .inorder = F,
              .packages = c('data.table'), 
              .noexport = c('qr','wc','wdfc','wdfc.mm.g','wdfc.mm','why','cf','coef.results','Xs'),
              .verbose=T) %dopar% {
  hasSOL = F  # update later with conditional 
  jd = jDecompose(df.iter[['ym']])


  wdfc = df.common.intx[[jd$intx[1]]][ym%in%jd$ym,][,SOL_WSPSO:=NULL]
  wdfc[is.na(wdfc)] = 0
  wdfc.PCAs = grep(names(wdfc), pattern='PCA_', value=T)  #--> new v4: add "SOL_" in subsequent section

  wdfc[,(wdfc.PCAs):=lapply(.SD, function(y){ y - mean(y, na.rm=T)}), by = .(ym, hour, weekday), .SDcols = wdfc.PCAs]
  wdfc[,icpt:=1]
  wdfc = wdfc[,c('dh_e','hour','mo','icpt',wdfc.PCAs), with=F]

  
  # Check for within-group variance; 
  # Catalog all of the mo-hour-PCA combinations that have no variance and drop them from the
  # model matrix by name.
  wc = melt(wdfc[,lapply(.SD, var), by=.(mo, hour), .SDcols = wdfc.PCAs], id.vars = c('mo','hour'))[value<1,][,value:=NULL]
  wc[, dropvars:=paste0('as.factor(hour)',hour,':as.factor(mo)',mo,':',variable)]
  
  
  
  # Now, make the model matrix #
  PCAs.intx = paste0('as.factor(hour):as.factor(mo):',wdfc.PCAs)  # these are the coef. FE's
  reg.form = as.formula(paste(' ~',paste(PCAs.intx, collapse=' + ')))
  wdfc.mm = model.matrix(reg.form, data=wdfc)
  wdfc.mm = wdfc.mm[,setdiff(colnames(wdfc.mm), wc[,dropvars])]  # only keep the columns (intx vars) not in dropvars (those that have zero within-variance) )
  wdfc.mm = wdfc.mm[,apply(wdfc.mm, MAR=2, var)!=0]  #--> within some cluster-hour-month groupings, there is no variance
  #  dim(wdfc.mm)                                              ###---> hmm, is this an issue for demeanlist? dropping these? No...should be OK. just dropping cols, not row.
  
  if(dim(wdfc.mm)[[1]]<=dim(wdfc.mm)[[2]]) {
    print('Not enough obs'); stop}
  

#  if('gpuR' %in% .packages(T)){
  if(exists('gpuMatrix')){
    gpu.data.type = 'float'
    if(dim(wdfc.mm)[[2]]<10000) gpu.data.type = 'double'  # change back to double!
    cat(paste0('i=',i, ' data type: ', gpu.data.type, '\n'))
    wdfc.mm.g = gpuMatrix(wdfc.mm, type=gpu.data.type)
        cat('gpuR used\n')
  } else {
    cat(paste0('i=',' non-gpu data type\n'))
    wdfc.mm.g = as.matrix(wdfc.mm)
        cat('gpuR NOT used\n')
  }
  
  cat('qr without SOL\n')
  Xs = list(qr.noSOL = qr(wdfc.mm.g))


# With-SOL ----------------------------------------------------------------

  
# if(grepl('WEST', df.iter$ymn)){ 
  if(T==T){
  hasSOL = T
  
  wdfc = df.common.intx[[jd$intx[1]]][ym%in%jd$ym,][,SOL_WSPSO:=NULL]
  wdfc[is.na(wdfc)] = 0
  wdfc.PCAs = grep(names(wdfc), pattern='PCA_|SOL_', value=T)  #--> new v4: add "SOL_"

  wdfc[,(wdfc.PCAs):=lapply(.SD, function(y){ y - mean(y, na.rm=T)}), by = .(ym, hour, weekday), .SDcols = wdfc.PCAs]
  wdfc[,icpt:=1]
  wdfc = wdfc[,c('dh_e','hour','mo','icpt',wdfc.PCAs), with=F]
  
  # Check for within-group variance; can't invert a matrix where each within-group isn't zero, right?
  # Catalog all of the mo-hour-PCA combinations that have no variance and drop them from the
  # model matrix by name.
  wc = melt(wdfc[,lapply(.SD, var), by=.(mo, hour), .SDcols = wdfc.PCAs], id.vars = c('mo','hour'))[value<1,][,value:=NULL]  # value<1 usually, but dropping dh_e where solar has no var.
  wc[, dropvars:=paste0('as.factor(hour)',hour,':as.factor(mo)',mo,':',variable)]
  
  # Now, make the model matrix #
  PCAs.intx = paste0('as.factor(hour):as.factor(mo):',wdfc.PCAs)  # these are the coef. FE's
  reg.form = as.formula(paste(' ~',paste(PCAs.intx, collapse=' + ')))
  wdfc.mm = model.matrix(reg.form, data=wdfc)
  wdfc.mm = wdfc.mm[,setdiff(colnames(wdfc.mm), wc[,dropvars])]  # only keep the columns (intx vars) not in dropvars (those that have zero within-variance) )
  wdfc.mm = wdfc.mm[,apply(wdfc.mm, MAR=2, var)!=0]  #--> within some cluster-hour-month groupings, there is no variance
  #  dim(wdfc.mm)                                              ###---> hmm, is this an issue for demeanlist? dropping these? No...should be OK. just dropping cols, not row.
  
  if(dim(wdfc.mm)[[1]]<=dim(wdfc.mm)[[2]]) {
    print('Not enough obs'); stop}

  #  if('gpuR' %in% .packages(T)){
  if(exists('gpuMatrix')){
    gpu.data.type = 'float'
    if(dim(wdfc.mm)[[2]]<10000) gpu.data.type = 'double'
    cat(paste0('i=',' data type: ', gpu.data.type, '\n'))
    wdfc.mm.g = gpuMatrix(wdfc.mm, type=gpu.data.type)
    cat('gpuR used\n')
  } else {
    cat(paste0('i=', ' non-gpu data type\n'))
    wdfc.mm.g = as.matrix(wdfc.mm)
    cat('gpuR NOT used\n')
  }
  
  cat('qr with-SOL (WEST)\n')
  Xs[['qr.SOL']] = qr(wdfc.mm.g)
  rm(wdfc.mm.g);rm(wdfc);rm(wdfc.mm)
  gc()
} # end ALL are T; all are "WEST'
  
cat('Finished QR on ');cat(df.iter$ymn);cat('\n')


coef.results = list()

for(a in 1:length(df.iter$df)){
  cat('ORISPL_');cat(df.iter$df[[a]]$ORISPL[1]);cat(' begins...\n')
    why = df.iter$df[[a]]
      Ys.vars = c('SO2_MASS','NOX_MASS','CO2_MASS','GLOAD_MASS')
      why[,(Ys.vars):=lapply(.SD, function(y){ y - mean(y, na.rm=T)}), by = .(ym), .SDcols = Ys.vars]

    for(ss in 1:length(Xs)){
      isSOL = names(Xs)[[ss]]
      cat(isSOL); cat('\n')
      cf = as.data.table(qr.coef(Xs[[ss]], as.matrix(why[,Ys.vars, with=F])),
                         keep.rownames = 'coefnames')

      cf[,c('hour','mo','PCA'):=tstrsplit(coefnames, ':')]
      cf[,hour:=tstrsplit(hour, ')', keep=2, type.convert=T)]
      cf[,  mo:=tstrsplit(mo  , ')', keep=2, type.convert=T)]
      cf[, c('ORISPL','coefnames','isSOL','ymn', 'intx'):=list(df.iter$df[[a]]$ORISPL[1],NULL, isSOL, df.iter$ymn, jd$intx[1])]
      cf = cf[order(mo, hour, PCA), .(ymn, intx, ORISPL, PCA, mo, hour, isSOL, SO2_MASS, NOX_MASS, CO2_MASS, GLOAD_MASS)]
      
    coef.results[[length(coef.results)+1]] = cf
    } # end ss loop (over noSOL/SOL)

if(a%%3==0) gc()
} # end a-loop over ORISPL
rm(Xs); gc()
return(rbindlist(coef.results, fill=T)) 
} # end dopar


saveRDS(rbindlist(RES, fill=T), file.path(WORK.OUT, 'usingBetasProcessed.rds')) 














## Useful plot checks:
dev.new()
ggplot(RES.sum %>% dplyr::filter(grepl('NWPP', PCA) & grepl('PCA', PCA)), aes(x = hour, y = GLOAD_MASS, col = isSOL)) +
 # geom_path(aes(lty = grepl('SOL', PCA)), lwd=1) + 
  geom_path(lwd=1) + 
 # coord_cartesian(ylim=c(-1, 2)) + #lims(y = c(-1, 2)) +
  facet_grid(cols = vars(mo)) + theme_bw()

for(eg in gsub('PCA_','',unique(RES.sum$PCA))){
  print(eg)
  png(file.path(WORK.OUT, paste0(eg, '.png')), width=1200*3, height=900*3, units='px', res=300)
  
  ggplot(RES.sum %>% dplyr::filter(grepl(eg, PCA) & grepl('PCA', PCA)), aes(x = hour, y = GLOAD_MASS, col = isSOL)) +
    # geom_path(aes(lty = grepl('SOL', PCA)), lwd=1) + 
    geom_path(lwd=1) + 
    # coord_cartesian(ylim=c(-1, 2)) + #lims(y = c(-1, 2)) +
    facet_grid(cols = vars(mo)) + theme_bw()
  
  dev.off()
}


  # SO2 should be around 3-6  
  # NOx should be around 1.5lbs/MWh off-pea, to 1.8lbs/MWh peak
  # CO2 should be around 1,400 to 1,700 (peak) lbs/MWh. (.7 to .85 tons)